Metadata basics
First do the plots with kruskall wallis comparison between groups to
see if there’s an overall difference, then do the pairwise tests because
KW says there is a difference.
metadata <- read_metadata(metadata_handle)
metadata <- metadata %>% mutate(Group = if_else(Group == 'Control_HealthySerosurvey', 'Healthy Control', Group))
metadata <- metadata %>% mutate(Group = if_else(Group == 'Acute_Typhi', 'Acute typhoid', Group))
number_per_country <- metadata %>% group_by(Country) %>% summarise(count = n())
number_per_country %>% kbl() %>% kable_styling()
|
Country
|
count
|
|
Bangladesh
|
120
|
|
Malawi
|
114
|
|
Nepal
|
76
|
print(sum(number_per_country$count))
## [1] 310
metadata %>% group_by(Country, Group) %>% summarise(count = n()) %>% kbl() %>% kable_styling()
|
Country
|
Group
|
count
|
|
Bangladesh
|
Acute typhoid
|
40
|
|
Bangladesh
|
Carrier
|
40
|
|
Bangladesh
|
Healthy Control
|
40
|
|
Malawi
|
Acute typhoid
|
34
|
|
Malawi
|
Carrier
|
40
|
|
Malawi
|
Healthy Control
|
40
|
|
Nepal
|
Acute typhoid
|
29
|
|
Nepal
|
Carrier
|
30
|
|
Nepal
|
Healthy Control
|
17
|
metadata %>% group_by(Group) %>% summarise(count = n()) %>% kbl() %>% kable_styling()
|
Group
|
count
|
|
Acute typhoid
|
103
|
|
Carrier
|
110
|
|
Healthy Control
|
97
|
baseline_chars <- get_baseline_characteristics(metadata)
# baseline_chars$pct_anti
# baseline_chars$pct_anti %>% na_if(0)
# BEWARE - na_if(0.0) will convert ALL 0s to NAs, this is ok as they're only in the antibiotics of carriers and controls
# at the moment, but if others get added in, will screw with it.
# baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') %>% na_if(0.0) %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()
# na_if stopped working, so had to do this instead
baseline_chars_table <- baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number')
baseline_chars_table$Carrier <- replace(baseline_chars_table$Carrier,which(baseline_chars_table$Carrier==0),NA)
baseline_chars_table$`Healthy Control` <- replace(baseline_chars_table$`Healthy Control`,which(baseline_chars_table$`Healthy Control`==0),NA)
baseline_chars_table %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()
|
Country
|
variable_name
|
Acute typhoid
|
Carrier
|
Healthy Control
|
|
Bangladesh
|
Median age
|
6.0
|
37.0
|
28.5
|
|
Bangladesh
|
Women (%)
|
47.5
|
47.5
|
65.0
|
|
Bangladesh
|
Antibiotics in last 2 weeks (%)
|
37.5
|
NA
|
NA
|
|
Malawi
|
Median age
|
9.7
|
32.0
|
24.0
|
|
Malawi
|
Women (%)
|
64.7
|
57.5
|
65.0
|
|
Malawi
|
Antibiotics in last 2 weeks (%)
|
26.5
|
NA
|
NA
|
|
Nepal
|
Median age
|
17.2
|
43.9
|
35.0
|
|
Nepal
|
Women (%)
|
24.1
|
66.7
|
82.4
|
|
Nepal
|
Antibiotics in last 2 weeks (%)
|
44.8
|
NA
|
NA
|
# the kruskal wallis test is positive, showing a difference between the groups, so we move onto pairwise tests.
# ggplot(metadata, aes(x = Country, y = Age, fill = Group)) + geom_boxplot() + stat_compare_means(method = 'kruskal.test', label = "p")
comparisons_groups <- list(c("Acute typhoid", "Carrier"), c("Acute typhoid", "Healthy Control"), c("Carrier", "Healthy Control"))
comparisons_countries <- list(c('Bangladesh', 'Malawi'), c('Bangladesh', 'Nepal'), c('Malawi', 'Nepal'))
# default stat method when doing pairwise tests in wilcoxon.
ggboxplot(metadata, facet.by = "Country", y = "Age", x = "Group", color = "Group") + stat_compare_means(comparisons = comparisons_groups) + rremove("x.text") + rremove("xlab") + rremove("x.ticks") # +rotate_x_text(angle = 45)

ggboxplot(metadata, facet.by = "Group", y = "Age", x = "Country", color = "Country") + stat_compare_means(comparisons = comparisons_countries) + rotate_x_text(angle = 45)

country_group_sex <- metadata %>% group_by(Country, Group, Sex) %>% summarise(count = n())
plot_sex <- function(eg1, c){
d <- eg1 %>% filter(Country == c)
p <- ggplot(d, aes(x = Group, y = count, fill = Sex)) +
geom_bar(stat ='identity', position = 'fill') +
ylab('Proportion') +
ggtitle(c)# +
#theme(axis.text=element_text(size=34), axis.title=element_text(size=36,face="bold"), plot.title = element_text(size = 40, face = "bold"), legend.key.size = unit(4, 'cm'), legend.title = element_text(size = 34), legend.text = element_text(size = 28))
return(p)
}
m_sex <- plot_sex(country_group_sex, 'Malawi')
b_sex <- plot_sex(country_group_sex, 'Bangladesh')
n_sex <- plot_sex(country_group_sex, 'Nepal')
m_sex / b_sex / n_sex

Healthy vs Typhi
Plot phyla bar
plots
phyla <- strataa_metaphlan_data %>% mutate(clade_name = rownames(strataa_metaphlan_data)) %>% filter(grepl("p__", clade_name)) %>% filter(!grepl("c__", clade_name)) %>% pivot_longer(!c(clade_name, lowest_taxonomic_level), names_to = "sample", values_to = "relative_abundance")
metadata_for_phyla_plots <- strataa_metaphlan_metadata %>% dplyr::select(SampleID, Group, Country)
phyla_clean_metadata <- prep_data_to_plot_phyla(phyla, strataa_metaphlan_metadata)
order_of_groups <- c("Typhi", "Healthy")
bangladesh_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Bangladesh", group_order = order_of_groups)
# bangladesh_phyla_plot
malawi_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Malawi", group_order = order_of_groups)
nepal_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Nepal", group_order = order_of_groups)
bangladesh_phyla_plot / malawi_phyla_plot / nepal_phyla_plot

# going back to phyla so that get all the weird rare ones.
# need to join to meta-data
# remove carriers
# group by clade, group, country
# summarise to get the median.
# pivot wider to get the table.
# dplyr::select(order(colnames(.))) rearranges the columns in alphabetical order
# relocate moves the clade_name column to the first column.
phyla %>% left_join(metadata_for_phyla_plots, by = c("sample" = "SampleID")) %>% filter(Group != 'Carrier') %>% group_by(clade_name, Group, Country) %>% summarise(median_prevalence = median(relative_abundance)) %>% pivot_wider(names_from = c('Country', 'Group'), values_from = 'median_prevalence') %>% dplyr::select(order(colnames(.))) %>% relocate(clade_name, .before = 1) %>% kbl() %>% kable_styling()
|
clade_name
|
Bangladesh_Acute_Typhi
|
Bangladesh_Control_HealthySerosurvey
|
Malawi_Acute_Typhi
|
Malawi_Control_HealthySerosurvey
|
Nepal_Acute_Typhi
|
Nepal_Control_HealthySerosurvey
|
|
k__Archaea|p__Candidatus_Thermoplasmatota
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Archaea|p__Euryarchaeota
|
0.000000
|
0.000000
|
0.249010
|
0.000000
|
0.00000
|
0.22474
|
|
k__Archaea|p__Thaumarchaeota
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Acidobacteria
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Actinobacteria
|
14.361185
|
13.136440
|
1.893725
|
0.524965
|
3.33634
|
3.81550
|
|
k__Bacteria|p__Bacteria_unclassified
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Bacteroidetes
|
0.911970
|
2.216730
|
15.361165
|
34.630185
|
28.24679
|
20.53695
|
|
k__Bacteria|p__Candidatus_Melainabacteria
|
0.000000
|
0.000000
|
0.038085
|
0.052210
|
0.00000
|
0.02157
|
|
k__Bacteria|p__Candidatus_Saccharibacteria
|
0.017310
|
0.015625
|
0.000000
|
0.000000
|
0.00000
|
0.00203
|
|
k__Bacteria|p__Chloroflexi
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Elusimicrobia
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Firmicutes
|
80.670075
|
72.161565
|
60.161350
|
58.311770
|
53.74090
|
60.17250
|
|
k__Bacteria|p__Fusobacteria
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Lentisphaerae
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Proteobacteria
|
0.629715
|
1.272295
|
6.606440
|
6.195335
|
2.90174
|
2.52509
|
|
k__Bacteria|p__Spirochaetes
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Synergistetes
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Tenericutes
|
0.000000
|
0.000000
|
0.002715
|
0.000000
|
0.01965
|
0.01362
|
|
k__Bacteria|p__Verrucomicrobia
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Eukaryota|p__Ascomycota
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Eukaryota|p__Eukaryota_unclassified
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
alpha
diversity
Alpha diversity - all countries, healthy and acute
all_countries_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

# all_countries_healthy_acute_alpha$alpha_by_country
all_countries_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
|
rownames(alpha_anova_summary[[1]])
|
Df
|
Sum.Sq
|
Mean.Sq
|
F.value
|
Pr..F.
|
is_it_significant
|
|
Country
|
2
|
5.74
|
2.87
|
17.4
|
1.36e-07
|
significant
|
|
Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
1.68
|
0.842
|
5.12
|
0.00699
|
significant
|
|
Sex:Age
|
1
|
1.07
|
1.07
|
6.5
|
0.0117
|
not_significant
|
|
Country:Sex
|
2
|
1.5
|
0.75
|
4.56
|
0.0119
|
not_significant
|
|
Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
1.39
|
0.693
|
4.21
|
0.0165
|
not_significant
|
|
Country:Age
|
2
|
1.02
|
0.51
|
3.1
|
0.0477
|
not_significant
|
|
Sex:Group
|
1
|
0.628
|
0.628
|
3.82
|
0.0525
|
not_significant
|
|
Country:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
0.816
|
0.408
|
2.48
|
0.0871
|
not_significant
|
|
Country:Group
|
2
|
0.698
|
0.349
|
2.12
|
0.123
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
0.618
|
0.309
|
1.88
|
0.157
|
not_significant
|
|
Group
|
1
|
0.164
|
0.164
|
0.997
|
0.319
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
0.359
|
0.179
|
1.09
|
0.339
|
not_significant
|
|
Sex
|
1
|
0.0592
|
0.0592
|
0.36
|
0.55
|
not_significant
|
|
Country:Sex:Age
|
2
|
0.188
|
0.0938
|
0.57
|
0.567
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.0511
|
0.0511
|
0.31
|
0.578
|
not_significant
|
|
Sex:Group:Age
|
1
|
0.0368
|
0.0368
|
0.224
|
0.637
|
not_significant
|
|
Group:Age
|
1
|
0.0147
|
0.0147
|
0.0891
|
0.766
|
not_significant
|
|
Country:Group:Age
|
2
|
0.0713
|
0.0357
|
0.217
|
0.805
|
not_significant
|
|
Age
|
1
|
0.00772
|
0.00772
|
0.0469
|
0.829
|
not_significant
|
|
Country:Sex:Group
|
2
|
0.0595
|
0.0297
|
0.181
|
0.835
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.00675
|
0.00675
|
0.041
|
0.84
|
not_significant
|
|
Country:Sex:Group:Age
|
2
|
0.05
|
0.025
|
0.152
|
0.859
|
not_significant
|
|
Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.000136
|
0.000136
|
0.000828
|
0.977
|
not_significant
|
|
Residuals
|
163
|
26.8
|
0.165
|
NA
|
NA
|
NA
|
all_countries_healthy_acute_alpha$alpha_plot_group

all_countries_healthy_acute_alpha$alpha_plot_antibiotics

alpha diversity, malawi only
malawi_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Malawi'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

malawi_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
|
rownames(alpha_anova_summary[[1]])
|
Df
|
Sum.Sq
|
Mean.Sq
|
F.value
|
Pr..F.
|
is_it_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
2.27
|
2.27
|
16
|
0.000167
|
significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
1.31
|
1.31
|
9.27
|
0.0034
|
significant
|
|
Sex
|
1
|
1.16
|
1.16
|
8.21
|
0.00567
|
significant
|
|
Group
|
1
|
0.84
|
0.84
|
5.93
|
0.0177
|
not_significant
|
|
Age
|
1
|
0.451
|
0.451
|
3.19
|
0.079
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.406
|
0.406
|
2.87
|
0.0952
|
not_significant
|
|
Sex:Group
|
1
|
0.313
|
0.313
|
2.21
|
0.142
|
not_significant
|
|
Sex:Group:Age
|
1
|
0.0978
|
0.0978
|
0.691
|
0.409
|
not_significant
|
|
Sex:Age
|
1
|
0.0164
|
0.0164
|
0.116
|
0.734
|
not_significant
|
|
Group:Age
|
1
|
0.00122
|
0.00122
|
0.00864
|
0.926
|
not_significant
|
|
Residuals
|
63
|
8.91
|
0.141
|
NA
|
NA
|
NA
|
malawi_healthy_acute_alpha$alpha_plot_group

malawi_healthy_acute_alpha$alpha_plot_antibiotics

beta diversity
Acute vs healthy.
all_countries_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
all_countries_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.0285252
|
0.0009990
|
significant
|
|
Group:Age
|
0.0117687
|
0.0159840
|
not_significant
|
|
Age
|
0.0092014
|
0.0379620
|
not_significant
|
|
Sex:Group:Age
|
0.0072077
|
0.1148851
|
not_significant
|
|
Sex
|
0.0070879
|
0.1328671
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0117438
|
0.1928072
|
not_significant
|
|
Sex:Age
|
0.0058394
|
0.2427572
|
not_significant
|
|
Sex:Group
|
0.0057923
|
0.2427572
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0055130
|
0.2827173
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0043096
|
0.5244755
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0070480
|
0.8351648
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.8959630
|
NA
|
NA
|
bgd_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
bgd_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.0782718
|
0.0009990
|
significant
|
|
Sex
|
0.0230592
|
0.0299700
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0199564
|
0.0759241
|
not_significant
|
|
Sex:Age
|
0.0168558
|
0.1438561
|
not_significant
|
|
Age
|
0.0146689
|
0.2307692
|
not_significant
|
|
Group:Age
|
0.0109377
|
0.4825175
|
not_significant
|
|
Sex:Group:Age
|
0.0100500
|
0.5934066
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0080933
|
0.7442557
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0074906
|
0.8351648
|
not_significant
|
|
Sex:Group
|
0.0067483
|
0.9050949
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0059228
|
0.9150849
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.7979452
|
NA
|
NA
|
mal_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
mal_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.1995900
|
0.0009990
|
significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0371210
|
0.0009990
|
significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0381931
|
0.0029970
|
significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0267524
|
0.0159840
|
not_significant
|
|
Sex:Group
|
0.0200496
|
0.0329670
|
not_significant
|
|
Age
|
0.0194908
|
0.0399600
|
not_significant
|
|
Sex:Age
|
0.0144797
|
0.1358641
|
not_significant
|
|
Sex
|
0.0141068
|
0.1388611
|
not_significant
|
|
Sex:Group:Age
|
0.0091754
|
0.4935065
|
not_significant
|
|
Group:Age
|
0.0078947
|
0.6063936
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.6131466
|
NA
|
NA
|
nep_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
nep_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.0598990
|
0.0029970
|
significant
|
|
Sex
|
0.0484279
|
0.0129870
|
not_significant
|
|
Sex:Group
|
0.0246755
|
0.3026973
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0244159
|
0.3226773
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0450942
|
0.3896104
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0207951
|
0.4775225
|
not_significant
|
|
Sex:Group:Age
|
0.0175717
|
0.6963037
|
not_significant
|
|
Age
|
0.0154038
|
0.8061938
|
not_significant
|
|
Sex:Age
|
0.0148604
|
0.8171828
|
not_significant
|
|
Group:Age
|
0.0102689
|
0.9820180
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0188359
|
0.9990010
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.6997518
|
NA
|
NA
|
all_countries_beta_acute_healthy$pc12 + bgd_beta_acute_healthy$pc12 + mal_beta_acute_healthy$pc12 + nep_beta_acute_healthy$pc12

maaslin2
taxonomy
Maaslin basics
groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')
bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
bangladesh_taxonomic_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'metaphlan')
malawi_taxonomic_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'metaphlan')
nepal_taxonomic_maaslin <- read_in_maaslin('Nepal', groups_to_analyse, nep_variables_for_analysis, 'metaphlan')
bangladesh_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(bangladesh_taxonomic_maaslin)
malawi_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(malawi_taxonomic_maaslin)
nepal_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(nepal_taxonomic_maaslin)
bangladesh_maaslin_stats <- basic_maaslin_stats(bangladesh_taxonomic_maaslin_filtered, 'Bangladesh', bang_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats <- basic_maaslin_stats(malawi_taxonomic_maaslin_filtered, 'Malawi', mwi_variables_for_analysis, groups_to_analyse)
nepal_maaslin_stats <- basic_maaslin_stats(nepal_taxonomic_maaslin_filtered, 'Nepal', nep_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats$volcano_plot / bangladesh_maaslin_stats$volcano_plot / nepal_maaslin_stats$volcano_plot

nrow(malawi_maaslin_stats$maaslin_results_sig)
## [1] 296
nrow(bangladesh_maaslin_stats$maaslin_results_sig)
## [1] 23
nrow(nepal_maaslin_stats$maaslin_results_sig)
## [1] 0
There were 296 species significantly (q-val < 0.05) associated
with health/disease in Malawi, in Bangladesh, and in Nepal.
Combine the taxonomic maaslins, and print out the species that are
sig in both and share directions.
Because sequencing run and participant type are totally confounded
for Bangladesh, need to exclude sequencing run from the final model for
Bangladesh (otherwise, wipes out the signals).
associated at both sites
bang_mal <- list(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered)
combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# View(combined_results$positive_coef)
# View(combined_results$negative_coef)
combined_results$positive_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>%
select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
kbl() %>% kable_styling()
|
feature
|
Species
|
Coefficient Bangladesh
|
Standard Error Bangladesh
|
Q-value Bangladesh
|
Coefficient Malawi
|
Standard Error Malawi
|
Q-value Malawi
|
|
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Clostridiaceae.g__Clostridium.s__Clostridium_SGB6179
|
s__Clostridium_SGB6179
|
8.79
|
1.34
|
4.25e-05
|
3.11
|
0.936
|
0.0286
|
|
k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Prevotellaceae.g__Prevotella.s__Prevotella_copri_clade_A
|
s__Prevotella_copri_clade_A
|
4.54
|
0.978
|
0.00971
|
7.64
|
1.25
|
1.21e-05
|
|
k__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.f__Veillonellaceae.g__GGB4266.s__GGB4266_SGB5809
|
s__GGB4266_SGB5809
|
4.58
|
1.09
|
0.0147
|
6.94
|
0.981
|
4.38e-07
|
|
k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Pasteurellales.f__Pasteurellaceae.g__Haemophilus.s__Haemophilus_parainfluenzae
|
s__Haemophilus_parainfluenzae
|
3.64
|
0.979
|
0.03
|
6.92
|
0.953
|
2.26e-07
|
|
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Peptostreptococcaceae.g__Romboutsia.s__Romboutsia_timonensis
|
s__Romboutsia_timonensis
|
4.52
|
1.25
|
0.0386
|
5.07
|
0.903
|
5.91e-05
|
combined_results$negative_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>%
select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
kbl() %>% kable_styling()
|
feature
|
Species
|
Coefficient Bangladesh
|
Standard Error Bangladesh
|
Q-value Bangladesh
|
Coefficient Malawi
|
Standard Error Malawi
|
Q-value Malawi
|
|
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Lachnospiraceae_unclassified.s__Lachnospiraceae_bacterium
|
s__Lachnospiraceae_bacterium
|
-3.05
|
0.838
|
0.0366
|
-2.87
|
0.785
|
0.0135
|
# todo - refactoring the run_combine_maaslins means that we dont get the species that are only associated at one site. need to fix that.
# nrow(combined_results$mwi_maaslin_only)
# nrow(combined_results$bang_maaslin_only)
There were species significantly (q-val < 0.05) associated with
health/disease in Malawi only and in Bangladesh only.
The ones associated at only one site are written out to a file, you
can look at them manually there.
plots of species of
interest
strataa_metaphlan_data_longer <- strataa_metaphlan_data %>% mutate(feature = rownames(strataa_metaphlan_data)) %>% pivot_longer(!c(feature, lowest_taxonomic_level), names_to = "SampleID", values_to = "prevalence")
strataa_metaphlan_data_longer_meta <- strataa_metaphlan_data_longer %>% left_join(strataa_metaphlan_metadata, by = c("SampleID" = "SampleID"))
pc <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Prevotella_copri_clade_A')

cs <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Clostridium_SGB6179')

SGB5809 <-run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__GGB4266_SGB5809')

hp <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Haemophilus_parainfluenzae')

rt <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Romboutsia_timonensis')

lb <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Lachnospiraceae_bacterium')

# pc + cs + SGB5809 + hp + rt + lb
P. copri
clades
Print the results for all the P. copri clades
# all_taxa_maaslin <- combined_maaslins$all_features
# p_copri_maaslin <- all_taxa_maaslin %>% filter(grepl('_Prevotella_copri_', feature)) %>% filter(qval_bang < 0.05 | qval_mal < 0.05)
# write_csv(p_copri_maaslin, file.path(maaslin_taxonomic_output_root_folder, 'combined_maaslins_p_copri.csv'))
maaslin functional
groups
Functional groups associated with health
bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
# nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')
bang_func_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'bigmap')
mal_func_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'bigmap')
# combined_results <- run_combine_maaslins(bang_func_maaslin, mal_func_maaslin, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)
bang_mal <- list(bang_func_maaslin, mal_func_maaslin)
combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)
combined_results$positive_coef %>%
select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
kbl() %>% kable_styling()
|
MGC_class
|
Species
|
feature
|
Coefficient Bangladesh
|
Standard Error Bangladesh
|
Q-value Bangladesh
|
Coefficient Malawi
|
Standard Error Malawi
|
Q-value Malawi
|
|
succinate2propionate
|
Dialister_invisus_DSM_15470_genomic_scaffold
|
gb.GG698602.1.region002.GC_DNA..Entryname.succinate2propionate..OS.Dialister_invisus_DSM_15470_genomic_scaffold..SMASHregion.region002..NR.1
|
4.49
|
0.769
|
0.000238
|
4.32
|
0.937
|
0.00196
|
|
Pyruvate2acetate.formate
|
Prevotella_copri_strain_AM22.19
|
gb.QRIF01000001.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Prevotella_copri_strain_AM22.19..SMASHregion.region001..NR.7
|
4.95
|
0.865
|
0.000259
|
4.76
|
1.23
|
0.0122
|
|
TPP_AA_metabolism
|
Prevotella_sp._S7_MS_2
|
gb.JRPT01000001.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Prevotella_sp._S7_MS_2..SMASHregion.region001..NR.11..BG.2
|
4.86
|
0.89
|
0.000381
|
5.14
|
1.16
|
0.00314
|
|
Arginine2_Hcarbonate
|
.Clostridium._sordellii_genome_assembly
|
gb.LN679998.1.region007.GC_DNA..Entryname.Arginine2_Hcarbonate..OS..Clostridium._sordellii_genome_assembly..SMASHregion.region007..NR.2
|
4.59
|
0.824
|
0.000381
|
3.51
|
0.732
|
0.00116
|
|
Others_HGD_unassigned
|
Prevotella_copri_strain_AF24.12
|
gb.QRVA01000039.1.region001.GC_DNA..Entryname.Others_HGD_unassigned..OS.Prevotella_copri_strain_AF24.12..SMASHregion.region001..NR.6
|
4.72
|
0.862
|
0.000381
|
5.74
|
1.09
|
0.000359
|
|
Pyruvate2acetate.formate
|
Prevotella_sp._AM34.19LB
|
gb.QSIG01000002.1.region002.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Prevotella_sp._AM34.19LB..SMASHregion.region002..NR.3
|
4.25
|
0.783
|
0.000381
|
3.28
|
0.956
|
0.0336
|
|
TPP_fatty_acids
|
Prevotella_copri_strain_AM22.19
|
gb.QRIF01000003.1.region001.GC_DNA..Entryname.TPP_fatty_acids..OS.Prevotella_copri_strain_AM22.19..SMASHregion.region001..NR.6
|
5.07
|
0.952
|
0.000422
|
5.33
|
1.07
|
0.000706
|
|
Rnf_complex
|
Prevotella_copri_strain_AF38.11
|
gb.QROP01000022.1.region001.GC_DNA..Entryname.Rnf_complex..OS.Prevotella_copri_strain_AF38.11..SMASHregion.region001..NR.6
|
4.74
|
0.914
|
0.000649
|
5.23
|
1.12
|
0.00176
|
|
PFOR_II_pathway
|
Romboutsia_ilealis_strain_CRIB_genome
|
gb.LN555523.1.region001.GC_DNA..Entryname.PFOR_II_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region001..NR.1
|
4.84
|
0.983
|
0.00104
|
2.98
|
0.719
|
0.00629
|
|
Others_HGD_unassigned
|
Romboutsia_ilealis_strain_CRIB_genome
|
gb.LN555523.1.region003.GC_DNA..Entryname.Others_HGD_unassigned..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region003..NR.1
|
4.59
|
0.935
|
0.00105
|
2.76
|
0.558
|
0.000812
|
|
porA
|
Prevotella_copri_strain_AF10.17
|
gb.QSAV01000013.1.region001.GC_DNA..Entryname.porA..OS.Prevotella_copri_strain_AF10.17..SMASHregion.region001..NR.6
|
4.46
|
0.918
|
0.00124
|
5.08
|
1.17
|
0.00387
|
|
Rnf_complex
|
Romboutsia_ilealis_strain_CRIB_genome
|
gb.LN555523.1.region002.GC_DNA..Entryname.Rnf_complex..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region002..NR.1
|
4.36
|
0.957
|
0.0031
|
2.83
|
0.622
|
0.00228
|
|
PFOR_II_pathway
|
Romboutsia_ilealis_strain_CRIB_genome
|
gb.LN555523.1.region004.GC_DNA..Entryname.PFOR_II_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region004..NR.1
|
4.39
|
0.968
|
0.00318
|
3.37
|
0.814
|
0.00646
|
|
Formate_dehydrogenase
|
Haemophilus_sp._HMSC61B11_genomic_scaffold
|
gb.KV837963.1.region001.GC_DNA..Entryname.Formate_dehydrogenase..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5
|
3.41
|
0.754
|
0.00327
|
4.89
|
0.985
|
0.000767
|
|
Pyruvate2acetate.formate.Acetyl.CoA_pathway
|
Romboutsia_ilealis_strain_CRIB_genome
|
gb.LN555523.1.region005.GC_DNA..Entryname.Pyruvate2acetate.formate.Acetyl.CoA_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region005..NR.1
|
4.26
|
1
|
0.0072
|
4.58
|
1.11
|
0.00659
|
|
Pyruvate2acetate.formate
|
Clostridium_carboxidivorans
|
gb.CP011803.1.region013.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Clostridium_carboxidivorans..SMASHregion.region013..NR.1
|
3.69
|
0.89
|
0.00954
|
3.11
|
0.762
|
0.00733
|
|
OD_eut_pdu_related.PFOR_II_pathway
|
Paeniclostridium_sordellii_strain_AM370
|
gb.CP014150.1.region011.GC_DNA..Entryname.OD_eut_pdu_related.PFOR_II_pathway..OS.Paeniclostridium_sordellii_strain_AM370..SMASHregion.region011..NR.4
|
2.61
|
0.629
|
0.00954
|
3.94
|
0.574
|
4.58e-06
|
|
TPP_AA_metabolism.Arginine2putrescine
|
Haemophilus_sp._HMSC61B11_genomic_scaffold
|
gb.KV837955.1.region001.GC_DNA..Entryname.TPP_AA_metabolism.Arginine2putrescine..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5
|
3.93
|
0.951
|
0.00959
|
4.39
|
0.918
|
0.0012
|
|
Respiratory_glycerol
|
Haemophilus_parainfluenzae_HK2019
|
gb.AJTC01000042.1.region001.GC_DNA..Entryname.Respiratory_glycerol..OS.Haemophilus_parainfluenzae_HK2019..SMASHregion.region001..NR.5
|
3.55
|
0.882
|
0.0136
|
5.14
|
0.975
|
0.000344
|
|
Pyruvate2acetate.formate
|
Haemophilus_parainfluenzae_HK262
|
gb.AJMW01000041.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_parainfluenzae_HK262..SMASHregion.region001..NR.5
|
3.17
|
0.791
|
0.0137
|
5.08
|
1.01
|
0.000657
|
|
TPP_AA_metabolism
|
Clostridium_celatum_DSM_1785_genomic_scaffold
|
gb.KB291615.1.region002.GC_DNA..Entryname.TPP_AA_metabolism..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region002..NR.1
|
3.24
|
0.813
|
0.0138
|
2.5
|
0.72
|
0.0303
|
|
Arginine2putrescine.Putrescine2spermidine
|
Romboutsia_sp._Frifi_strain_FRIFI_genome
|
gb.LN650648.1.region002.GC_DNA..Entryname.Arginine2putrescine.Putrescine2spermidine..OS.Romboutsia_sp._Frifi_strain_FRIFI_genome..SMASHregion.region002..NR.1
|
2.66
|
0.683
|
0.0177
|
2.57
|
0.523
|
0.000868
|
|
Pyruvate2acetate.formate
|
Haemophilus_haemolyticus_HK386
|
gb.AJSV01000030.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_haemolyticus_HK386..SMASHregion.region001..NR.2
|
2.38
|
0.631
|
0.0234
|
5.02
|
0.724
|
3.95e-06
|
|
Pyruvate2acetate.formate
|
Haemophilus_aegyptius_ATCC_11116_genomic_scaffold
|
gb.GL878527.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_aegyptius_ATCC_11116_genomic_scaffold..SMASHregion.region001..NR.2
|
2.16
|
0.574
|
0.0236
|
3.52
|
0.679
|
0.000428
|
|
acetate2butyrate
|
Clostridium_botulinum_B_str._Eklund
|
gb.CP001056.1.region002.GC_DNA..Entryname.acetate2butyrate..OS.Clostridium_botulinum_B_str._Eklund..SMASHregion.region002..NR.5..BG.2
|
2.42
|
0.646
|
0.0245
|
2.53
|
0.754
|
0.0396
|
|
OD_fatty_acids
|
Dialister_succinatiphilus_YIT_11850_genomic_scaffold
|
gb.JH591188.1.region001.GC_DNA..Entryname.OD_fatty_acids..OS.Dialister_succinatiphilus_YIT_11850_genomic_scaffold..SMASHregion.region001..NR.1
|
2.94
|
0.793
|
0.0265
|
2.54
|
0.579
|
0.00342
|
|
Pyruvate2acetate.formate
|
Clostridium_butyricum_strain_JKY6D1_chromosome
|
gb.CP013352.1.region004.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Clostridium_butyricum_strain_JKY6D1_chromosome..SMASHregion.region004..NR.6
|
3.46
|
0.938
|
0.0265
|
3.35
|
0.936
|
0.0241
|
|
TPP_AA_metabolism
|
Haemophilus_sputorum_HK_2154
|
gb.ALJP01000004.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Haemophilus_sputorum_HK_2154..SMASHregion.region001..NR.1
|
2.96
|
0.812
|
0.0299
|
4.52
|
0.866
|
0.000392
|
|
Fumarate2succinate
|
Haemophilus_sp._HMSC61B11_genomic_scaffold
|
gb.KV838003.1.region001.GC_DNA..Entryname.Fumarate2succinate..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5
|
3.34
|
0.924
|
0.0312
|
4.62
|
1.01
|
0.0022
|
|
Respiratory_glycerol
|
Aggregatibacter_sp._oral_taxon_458_str._W10330_genomic_scaffold
|
gb.KE952617.1.region001.GC_DNA..Entryname.Respiratory_glycerol..OS.Aggregatibacter_sp._oral_taxon_458_str._W10330_genomic_scaffold..SMASHregion.region001..NR.1
|
2.91
|
0.808
|
0.0323
|
4.64
|
0.926
|
0.000678
|
|
TPP_AA_metabolism
|
Haemophilus_pittmaniae_HK_85
|
gb.AFUV01000010.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Haemophilus_pittmaniae_HK_85..SMASHregion.region001..NR.1
|
2.43
|
0.686
|
0.0353
|
5.22
|
0.896
|
8.17e-05
|
|
PFOR_II_pathway
|
Clostridium_celatum_DSM_1785_genomic_scaffold
|
gb.KB291660.1.region001.GC_DNA..Entryname.PFOR_II_pathway..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region001..NR.1
|
3.51
|
0.994
|
0.0353
|
3.77
|
0.982
|
0.0132
|
|
acetate2butyrate
|
Clostridium_celatum_DSM_1785_genomic_scaffold
|
gb.KB291615.1.region001.GC_DNA..Entryname.acetate2butyrate..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region001..NR.1
|
3.41
|
0.981
|
0.0397
|
3.41
|
0.931
|
0.0197
|
|
Rnf_complex
|
Haemophilus_parainfluenzae_HK262
|
gb.AJMW01000064.1.region001.GC_DNA..Entryname.Rnf_complex..OS.Haemophilus_parainfluenzae_HK262..SMASHregion.region001..NR.6..BG.2
|
2.8
|
0.814
|
0.0418
|
5
|
0.992
|
0.000636
|
|
OD_fatty_acids.PFOR_II_pathway
|
Clostridium_botulinum_B_str._Eklund
|
gb.CP001056.1.region005.GC_DNA..Entryname.OD_fatty_acids.PFOR_II_pathway..OS.Clostridium_botulinum_B_str._Eklund..SMASHregion.region005..NR.5..BG.2
|
2.57
|
0.772
|
0.0497
|
3.16
|
0.835
|
0.0148
|
combined_results$negative_coef %>%
select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
kbl() %>% kable_styling()
|
MGC_class
|
Species
|
feature
|
Coefficient Bangladesh
|
Standard Error Bangladesh
|
Q-value Bangladesh
|
Coefficient Malawi
|
Standard Error Malawi
|
Q-value Malawi
|
|
NADH_dehydrogenase_I
|
Actinomyces_viscosus_C505_genomic_scaffold
|
gb.KI391968.1.region002.GC_DNA..Entryname.NADH_dehydrogenase_I..OS.Actinomyces_viscosus_C505_genomic_scaffold..SMASHregion.region002..NR.5
|
-3.01
|
0.574
|
0.000568
|
-2.9
|
0.796
|
0.0205
|
|
Nitrate_reductase
|
Actinomyces_johnsonii_F0510_genomic_scaffold
|
gb.KE951633.1.region001.GC_DNA..Entryname.Nitrate_reductase..OS.Actinomyces_johnsonii_F0510_genomic_scaffold..SMASHregion.region001..NR.2
|
-2.76
|
0.544
|
0.000832
|
-2.26
|
0.657
|
0.0332
|
|
Rnf_complex.TPP_AA_metabolism
|
Eubacterium_siraeum_DSM_15702_Scfld_03_40_genomic
|
gb.DS499548.1.region001.GC_DNA..Entryname.Rnf_complex.TPP_AA_metabolism..OS.Eubacterium_siraeum_DSM_15702_Scfld_03_40_genomic..SMASHregion.region001..NR.2
|
-2.44
|
0.727
|
0.047
|
-4.29
|
1.23
|
0.0298
|
combined_results$positive_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>% kbl() %>% kable_styling()
|
MGC_class
|
n
|
Species
|
|
Pyruvate2acetate.formate
|
7
|
Clostridium butyricum, Clostridium carboxidivorans, Haemophilus
aegyptius, Haemophilus haemolyticus, Haemophilus parainfluenzae,
Prevotella copri, Prevotella sp.
|
|
TPP_AA_metabolism
|
4
|
Clostridium celatum, Haemophilus pittmaniae, Haemophilus sputorum,
Prevotella sp.
|
|
PFOR_II_pathway
|
3
|
Clostridium celatum, Romboutsia ilealis, Romboutsia ilealis
|
|
Rnf_complex
|
3
|
Haemophilus parainfluenzae, Prevotella copri, Romboutsia ilealis
|
|
Others_HGD_unassigned
|
2
|
Prevotella copri, Romboutsia ilealis
|
|
Respiratory_glycerol
|
2
|
Aggregatibacter sp., Haemophilus parainfluenzae
|
|
acetate2butyrate
|
2
|
Clostridium botulinum, Clostridium celatum
|
|
Arginine2_Hcarbonate
|
1
|
.Clostridium. sordellii
|
|
Arginine2putrescine.Putrescine2spermidine
|
1
|
Romboutsia sp.
|
|
Formate_dehydrogenase
|
1
|
Haemophilus sp.
|
|
Fumarate2succinate
|
1
|
Haemophilus sp.
|
|
OD_eut_pdu_related.PFOR_II_pathway
|
1
|
Paeniclostridium sordellii
|
|
OD_fatty_acids
|
1
|
Dialister succinatiphilus
|
|
OD_fatty_acids.PFOR_II_pathway
|
1
|
Clostridium botulinum
|
|
Pyruvate2acetate.formate.Acetyl.CoA_pathway
|
1
|
Romboutsia ilealis
|
|
TPP_AA_metabolism.Arginine2putrescine
|
1
|
Haemophilus sp.
|
|
TPP_fatty_acids
|
1
|
Prevotella copri
|
|
porA
|
1
|
Prevotella copri
|
|
succinate2propionate
|
1
|
Dialister invisus
|
combined_results$negative_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>% kbl() %>% kable_styling()
|
MGC_class
|
n
|
Species
|
|
NADH_dehydrogenase_I
|
1
|
Actinomyces viscosus
|
|
Nitrate_reductase
|
1
|
Actinomyces johnsonii
|
|
Rnf_complex.TPP_AA_metabolism
|
1
|
Eubacterium siraeum
|
healthy vs
carrier
phylum plots
order_of_groups <- c("Carrier", "Healthy")
# bangladesh_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Bangladesh", group_order = order_of_groups)
# bangladesh_phyla_plot
malawi_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Malawi", group_order = order_of_groups)
nepal_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Nepal", group_order = order_of_groups)
# bangladesh_phyla_plot /
malawi_phyla_plot / nepal_phyla_plot

alpha
diversity
Alpha diversity - all countries, healthy and carrier
all_countries_healthy_carrier_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Malawi', 'Nepal'), groups_of_interest = c('Carrier', 'Control_HealthySerosurvey'), comparisons = list(c('Carrier', 'Control_HealthySerosurvey')))
all_countries_healthy_carrier_alpha$alpha_by_country

all_countries_healthy_carrier_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
|
rownames(alpha_anova_summary[[1]])
|
Df
|
Sum.Sq
|
Mean.Sq
|
F.value
|
Pr..F.
|
is_it_significant
|
|
Country:Group
|
1
|
4.82
|
4.82
|
40.9
|
3.97e-09
|
significant
|
|
Group
|
1
|
1.41
|
1.41
|
11.9
|
0.000786
|
significant
|
|
Group:Age
|
1
|
0.639
|
0.639
|
5.42
|
0.0217
|
not_significant
|
|
Country:Sex:Age
|
1
|
0.467
|
0.467
|
3.96
|
0.0491
|
not_significant
|
|
Country
|
1
|
0.438
|
0.438
|
3.72
|
0.0564
|
not_significant
|
|
Sex:Group
|
1
|
0.35
|
0.35
|
2.97
|
0.0875
|
not_significant
|
|
Age
|
1
|
0.0966
|
0.0966
|
0.819
|
0.367
|
not_significant
|
|
Country:Sex:Group
|
1
|
0.0738
|
0.0738
|
0.626
|
0.43
|
not_significant
|
|
Sex:Age
|
1
|
0.0673
|
0.0673
|
0.571
|
0.451
|
not_significant
|
|
Country:Group:Age
|
1
|
0.0151
|
0.0151
|
0.128
|
0.721
|
not_significant
|
|
Country:Sex
|
1
|
0.0139
|
0.0139
|
0.118
|
0.732
|
not_significant
|
|
Sex
|
1
|
0.00909
|
0.00909
|
0.0771
|
0.782
|
not_significant
|
|
Country:Age
|
1
|
0.00766
|
0.00766
|
0.065
|
0.799
|
not_significant
|
|
Country:Sex:Group:Age
|
1
|
0.00184
|
0.00184
|
0.0156
|
0.901
|
not_significant
|
|
Sex:Group:Age
|
1
|
6.12e-06
|
6.12e-06
|
5.19e-05
|
0.994
|
not_significant
|
|
Residuals
|
111
|
13.1
|
0.118
|
NA
|
NA
|
NA
|
all_countries_healthy_carrier_alpha$alpha_plot_group

beta diversity
Carrier vs healthy.
all_countries_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi', 'Nepal'), c('Carrier', 'Control_HealthySerosurvey'))
all_countries_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.0653601
|
0.0009990
|
significant
|
|
Sex:Age
|
0.0141431
|
0.0199800
|
not_significant
|
|
Group:Age
|
0.0127135
|
0.0469530
|
not_significant
|
|
Age
|
0.0090662
|
0.2367632
|
not_significant
|
|
Sex:Group
|
0.0077753
|
0.3676324
|
not_significant
|
|
Sex
|
0.0072119
|
0.4195804
|
not_significant
|
|
Sex:Group:Age
|
0.0068119
|
0.5054945
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.8769180
|
NA
|
NA
|
# bgd_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Carrier', 'Control_HealthySerosurvey'))
# bgd_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()
mal_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Carrier', 'Control_HealthySerosurvey'))
mal_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.1999878
|
0.0009990
|
significant
|
|
Sex:Age
|
0.0304467
|
0.0049950
|
significant
|
|
Age
|
0.0187062
|
0.0519481
|
not_significant
|
|
Group:Age
|
0.0147510
|
0.1348651
|
not_significant
|
|
Sex:Group
|
0.0118411
|
0.2627373
|
not_significant
|
|
Sex
|
0.0099215
|
0.4385614
|
not_significant
|
|
Sex:Group:Age
|
0.0046358
|
0.9080919
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.7097099
|
NA
|
NA
|
nep_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Carrier', 'Control_HealthySerosurvey'))
nep_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.0836999
|
0.0009990
|
significant
|
|
Sex:Group
|
0.0241616
|
0.2477522
|
not_significant
|
|
Sex:Age
|
0.0240379
|
0.2687313
|
not_significant
|
|
Age
|
0.0195876
|
0.4735265
|
not_significant
|
|
Sex
|
0.0174225
|
0.6013986
|
not_significant
|
|
Sex:Group:Age
|
0.0149025
|
0.7352647
|
not_significant
|
|
Group:Age
|
0.0098712
|
0.9870130
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.8063168
|
NA
|
NA
|
mal_beta_carrier_healthy$pc12 + nep_beta_carrier_healthy$pc12 + plot_layout(guides = 'collect')

maaslin taxonomic
groups
groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey')
# bang_variables_for_analysis <- c("Group", "Sex", "Age")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "sequencing_lane")
nep_variables_for_analysis <- c("Group", "Sex", "Age")
# bangladesh_taxonomic_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'metaphlan')
malawi_taxonomic_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'metaphlan')
nepal_taxonomic_maaslin <- read_in_maaslin('Nepal', groups_to_analyse, nep_variables_for_analysis, 'metaphlan')
# bangladesh_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(bangladesh_taxonomic_maaslin)
malawi_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(malawi_taxonomic_maaslin)
nepal_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(nepal_taxonomic_maaslin)
# bangladesh_maaslin_stats <- basic_maaslin_stats(bangladesh_taxonomic_maaslin_filtered, 'Bangladesh', bang_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats <- basic_maaslin_stats(malawi_taxonomic_maaslin_filtered, 'Malawi', mwi_variables_for_analysis, groups_to_analyse)
nepal_maaslin_stats <- basic_maaslin_stats(nepal_taxonomic_maaslin_filtered, 'Nepal', nep_variables_for_analysis, groups_to_analyse)
# / bangladesh_maaslin_stats$volcano_plot
malawi_maaslin_stats$volcano_plot / nepal_maaslin_stats$volcano_plot

nrow(malawi_maaslin_stats$maaslin_results_sig)
## [1] 117
# nrow(bangladesh_maaslin_stats$maaslin_results_sig)
nrow(nepal_maaslin_stats$maaslin_results_sig)
## [1] 1
We’re not including the bangladesh samples in this analysis, because
the bangladesh carriers were processed differently (extracted without
being frozen).
There are no species significantly associated with carrier status in
all Mal and Nep, so not including the combine.
# combined_results <- run_combine_maaslins(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# only one species significantly associated in Nepal.
# there are no species consistently associated across all three sites
# bang_mal_nep <- list(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered, nepal_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_mal_nep, c('_bang', '_mwi', '_nep'), mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# View(combined_results$positive_coef)
# View(combined_results$negative_coef)
# there are no species associated in both bang and nep
# bang_nep <- list(bangladesh_taxonomic_maaslin_filtered, nepal_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_nep, c('_bang', '_nep'), bang_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# View(combined_results$positive_coef)
# View(combined_results$negative_coef)
# there are no species associated in both mal and nep
# mal_nep <- list(malawi_taxonomic_maaslin_filtered, nepal_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_nep, c('_mal', '_nep'), bang_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# View(combined_results$positive_coef)
# View(combined_results$negative_coef)
# bang_mal <- list(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# # positive is associated with health
# combined_results$positive_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>%
# select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
# rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
# dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
# kbl() %>% kable_styling()
# # negative is associated with carrier
# combined_results$negative_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>%
# select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
# rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
# dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
# kbl() %>% kable_styling()
# nrow(combined_results$mwi_maaslin_only)
# nrow(combined_results$bang_maaslin_only)
functional
groups
# bang_variables_for_analysis <- c("Group", "Sex", "Age")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "sequencing_lane")
# nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey')
# bang_func_carr_health_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'bigmap')
mal_func_carr_health_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'bigmap')
# combined_results <- run_combine_maaslins(bang_func_carr_health_maaslin, mal_func_carr_health_maaslin, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)
# bang_mal <- list(bang_func_carr_health_maaslin, mal_func_carr_health_maaslin)
# combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)
# # positive is associated with health
# # combined_results$positive_coef %>% kbl() %>% kable_styling()
# combined_results$positive_coef %>%
# select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
# rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
# dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
# kbl() %>% kable_styling()
# # negative is associated with carrier
# # combined_results$negative_coef %>% kbl() %>% kable_styling()
# combined_results$negative_coef %>%
# select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
# rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
# dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
# kbl() %>% kable_styling()
# # print out a nicely formatted table of the count of the MGCs
# combined_results$positive_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>% kbl() %>% kable_styling()
# combined_results$negative_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>% kbl() %>% kable_styling()
healthy vs acute vs
carriers - taxonomic
groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey', 'Acute_Typhi')
variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
heal_ac_car_taxonomic <- read_in_maaslin('Malawi', groups_to_analyse, variables_for_analysis, 'metaphlan')
heal_ac_car_taxonomic_cln <- filter_taxonomic_maaslin(heal_ac_car_taxonomic) #%>% filter(qval <= 0.05)
# spread heal_ac_car_taxonomic_cln wider so that the coef is on the same row for each feature
heal_ac_car_taxonomic_cln_w <- heal_ac_car_taxonomic_cln %>% pivot_wider(names_from = value, values_from = c(coef, qval), id_cols = c(feature, lowest_taxonomic_level))
changes relative to healthy state
# same_same cant do from here because filtered for qvalue
# 1
up_up <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi > 0 & coef_Carrier > 0 & qval_Acute_Typhi < 0.05 & qval_Carrier < 0.05) %>% mutate(type = '1.up_up')
print(nrow(up_up))
## [1] 68
# write_csv(up_up, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.1.up_up.csv'))
# 2
up_down <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi > 0 & coef_Carrier < 0 & qval_Acute_Typhi < 0.05 & qval_Carrier < 0.05) %>% mutate(type = '2.up_down')
print(nrow(up_down))
## [1] 0
# write_csv(up_down, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.2.up_down.csv'))
# 3
up_same <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi > 0 & qval_Acute_Typhi < 0.05 & qval_Carrier >= 0.05) %>% mutate(type = '3.up_same')
print(nrow(up_same))
## [1] 102
# write_csv(up_same, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.3.up_same.csv'))
# 4
down_up <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi < 0 & coef_Carrier > 0 & qval_Acute_Typhi < 0.05 & qval_Carrier < 0.05) %>% mutate(type = '4.down_up')
print(nrow(down_up))
## [1] 0
# write_csv(down_up, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.4.down_up.csv'))
# 5
down_down <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi < 0 & coef_Carrier < 0 & qval_Acute_Typhi < 0.05 & qval_Carrier < 0.05) %>% mutate(type = '5.down_down')
print(nrow(down_down))
## [1] 85
# write_csv(down_down, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.5.down_down.csv'))
# 6
down_same <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi < 0 & qval_Acute_Typhi < 0.05 & qval_Carrier > 0.05) %>% mutate(type = '6.down_same')
print(nrow(down_same))
## [1] 59
# write_csv(down_same, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.6.down_same.csv'))
# 7
same_up <- heal_ac_car_taxonomic_cln_w %>% filter(qval_Acute_Typhi > 0.05 & qval_Carrier < 0.05 & coef_Carrier > 0) %>% mutate(type = '7.same_up')
print(nrow(same_up))
## [1] 90
# write_csv(same_up, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.7.same_up.csv'))
# 8
same_down <- heal_ac_car_taxonomic_cln_w %>% filter(qval_Acute_Typhi > 0.05 & qval_Carrier < 0.05 & coef_Carrier < 0) %>% mutate(type = '8.same_down')
print(nrow(same_down))
## [1] 15
# 9
same_same <- heal_ac_car_taxonomic_cln_w %>% filter(qval_Acute_Typhi >= 0.05 & qval_Carrier >= 0.05) %>% mutate(type = '9.same_same')
print(nrow(same_same))
## [1] 970
all_patterns <- rbind(up_up, up_down, up_same, down_up, down_down, down_same, same_up, same_down, same_same)
write_csv(all_patterns, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.all_patterns.csv'))
up_in_ac_car <- all_patterns %>% filter(type %in% c('1.up_up', '2.up_down', '3.up_same', '4.down_up', '7.same_up'))
down_in_ac_car <- all_patterns %>% filter(type %in% c('5.down_down', '6.down_same', '8.same_down'))
up_in_health_only <- all_patterns %>% filter(type == '5.down_down')
up_in_car_only <- all_patterns %>% filter(type == '7.same_up')
prop_that_is_sgb <- function(input_data) {
x <- input_data %>% filter(str_detect(feature, 'SGB')) %>% summarise(prop = n()/nrow(input_data), n_sgb = n(), total_n = nrow(input_data))
print(x)
return(x)
}
prop_up_in_ac_car <- prop_that_is_sgb(up_in_ac_car)
## # A tibble: 1 × 3
## prop n_sgb total_n
## <dbl> <int> <int>
## 1 0.781 203 260
prop_down_in_ac_car <- prop_that_is_sgb(down_in_ac_car)
## # A tibble: 1 × 3
## prop n_sgb total_n
## <dbl> <int> <int>
## 1 0.258 41 159
prop_up_in_health_only <- prop_that_is_sgb(up_in_health_only)
## # A tibble: 1 × 3
## prop n_sgb total_n
## <dbl> <int> <int>
## 1 0.235 20 85
prop_up_in_car_only <- prop_that_is_sgb(up_in_car_only)
## # A tibble: 1 × 3
## prop n_sgb total_n
## <dbl> <int> <int>
## 1 0.811 73 90
prop.test(x = c(prop_up_in_ac_car$n_sgb, prop_down_in_ac_car$n_sgb), n = c(prop_up_in_ac_car$total_n, prop_down_in_ac_car$total_n), alternative = 'two.sided', correct = FALSE)
##
## 2-sample test for equality of proportions without continuity
## correction
##
## data: c(prop_up_in_ac_car$n_sgb, prop_down_in_ac_car$n_sgb) out of c(prop_up_in_ac_car$total_n, prop_down_in_ac_car$total_n)
## X-squared = 110.92, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.4383352 0.6074800
## sample estimates:
## prop 1 prop 2
## 0.7807692 0.2578616
prop.test(x = c(prop_up_in_health_only$n_sgb, prop_up_in_car_only$n_sgb), n = c(prop_up_in_health_only$total_n, prop_up_in_car_only$total_n), alternative = 'two.sided', correct = FALSE)
##
## 2-sample test for equality of proportions without continuity
## correction
##
## data: c(prop_up_in_health_only$n_sgb, prop_up_in_car_only$n_sgb) out of c(prop_up_in_health_only$total_n, prop_up_in_car_only$total_n)
## X-squared = 58.207, df = 1, p-value = 2.36e-14
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.6969416 -0.4546924
## sample estimates:
## prop 1 prop 2
## 0.2352941 0.8111111
# make a new strataa_metaphlan_data object where the rowname column is an actual column
strataa_metaphlan_data_cln <- strataa_metaphlan_data %>% mutate(feature = rownames(strataa_metaphlan_data))
rownames(strataa_metaphlan_data_cln) <- NULL
strataa_metaphlan_data_l <- strataa_metaphlan_data_cln %>% pivot_longer(!c(feature, lowest_taxonomic_level), names_to = 'sample', values_to = 'abundance') %>% left_join(strataa_metaphlan_metadata %>% mutate(sample = rownames(strataa_metaphlan_metadata)), by = 'sample')
# head(strataa_metaphlan_data_l)
strataa_metaphlan_data_mwi_sp <- strataa_metaphlan_data_l %>% filter(Country == 'Malawi', str_detect(feature, 's__'), str_detect(feature, 't__', negate = TRUE)) %>% mutate(SGB = str_detect(feature, '_SGB'))
head(strataa_metaphlan_data_mwi_sp)
## # A tibble: 6 × 13
## lowest_taxonomic_level feature sample abundance Group Sex Country Age
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 s__Prevotella_sp_885 k__Bacteria… 32580… 0 Acut… Fema… Malawi 9.71
## 2 s__Prevotella_sp_885 k__Bacteria… 32580… 0 Acut… Fema… Malawi 3.84
## 3 s__Prevotella_sp_885 k__Bacteria… 32580… 0 Acut… Male Malawi 5.63
## 4 s__Prevotella_sp_885 k__Bacteria… 32580… 0.150 Acut… Fema… Malawi 20.3
## 5 s__Prevotella_sp_885 k__Bacteria… 32580… 0 Acut… Male Malawi 7.51
## 6 s__Prevotella_sp_885 k__Bacteria… 32580… 0 Acut… Fema… Malawi 3.84
## # ℹ 5 more variables:
## # Antibiotics_taken_before_sampling_yes_no_assumptions <chr>,
## # sequencing_lane <chr>, SampleID <chr>, age_bracket <fct>, SGB <lgl>
mwi_sgb_abundance <- strataa_metaphlan_data_mwi_sp %>% group_by(sample, SGB) %>% summarise(total_abundance = sum(abundance)) %>% left_join(strataa_metaphlan_metadata %>% mutate(sample = rownames(strataa_metaphlan_metadata)), by = 'sample') %>% mutate(SGB = ifelse(SGB, 'SGB', 'non-SGB')) %>% mutate(Group = factor(Group, levels = c('Control_HealthySerosurvey', 'Acute_Typhi', 'Carrier')))
mwi_sgb_abundance %>% group_by(SGB, Group) %>% summarise(median_abundance = median(total_abundance), n = n()) %>% kbl() %>% kable_styling()
|
SGB
|
Group
|
median_abundance
|
n
|
|
SGB
|
Control_HealthySerosurvey
|
13.88699
|
40
|
|
SGB
|
Acute_Typhi
|
39.98151
|
34
|
|
SGB
|
Carrier
|
46.65501
|
40
|
|
non-SGB
|
Control_HealthySerosurvey
|
86.11299
|
40
|
|
non-SGB
|
Acute_Typhi
|
60.01852
|
34
|
|
non-SGB
|
Carrier
|
53.34503
|
40
|
# mwi_sgb_abundance <- mwi_sgb_abundance
# do a scatter plot of the abundance of sgb in each group
my_comparisons <- list(c('Acute_Typhi', 'Carrier'), c('Acute_Typhi', 'Control_HealthySerosurvey'), c('Carrier', 'Control_HealthySerosurvey'))
mwi_sgb_abundance_plot <- ggplot(mwi_sgb_abundance %>% filter(SGB == 'SGB'), aes(x = Group, y = total_abundance, color = Group)) +
geom_boxplot() +
stat_compare_means(comparisons = my_comparisons) +
stat_compare_means(label.y = 70) +
theme(legend.position = "none") +
labs(y = 'SGB abundance in metagenome (%)', x = '') +
theme(text = element_text(size=18)) +
scale_x_discrete(labels= c('Control', 'Acute', 'Carrier'))
mwi_sgb_abundance_plot$layers[[2]]$aes_params$textsize <- 4
# which_layers(mwi_sgb_abundance_plot, "GeomSignif")
ggsave(file.path(metaphlan_input_folder, 'mwi_sgb_abundance.png'), mwi_sgb_abundance_plot, width = 8, height = 4.8, units = 'in', dpi = 300)
# heal_ac_car_taxonomic_all_w %>% filter(is.na(qval_Carrier))
to_get_list <- c('k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Oscillospiraceae.g__Faecalibacterium.s__Faecalibacterium_prausnitzii', 'k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Barnesiellaceae.g__Barnesiella.s__Barnesiella_intestinihominis', 'k__Bacteria.p__Firmicutes.c__Bacilli.o__Lactobacillales.f__Lactobacillaceae.g__Limosilactobacillus.s__Limosilactobacillus_reuteri', 'k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Pasteurellales.f__Pasteurellaceae.g__Haemophilus.s__Haemophilus_parainfluenzae', 'k__Bacteria.p__Firmicutes.c__Erysipelotrichia.o__Erysipelotrichales.f__Erysipelotrichaceae.g__Holdemanella.s__Holdemanella_biformis', 'k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Mediterraneibacter.s__Ruminococcus_gnavus', 'k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Peptostreptococcaceae.g__Paeniclostridium.s__Eubacterium_tenue', 'k__Bacteria.p__Firmicutes.c__Erysipelotrichia.o__Erysipelotrichales.f__Turicibacteraceae.g__Turicibacter.s__Turicibacter_bilis', 'k__Bacteria.p__Firmicutes.c__Erysipelotrichia.o__Erysipelotrichales.f__Turicibacteraceae.g__Turicibacter.s__Turicibacter_sanguinis', 'k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Odoribacteraceae.g__Odoribacter.s__Odoribacter_splanchnicus', 'k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Blautia.s__Blautia_faecis', 'k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Enterobacterales.f__Enterobacteriaceae.g__Escherichia.s__Escherichia_coli', 'k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Lachnospiraceae_unclassified.s__Eubacterium_rectale', 'k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Bacteroidaceae.g__Bacteroides.s__Bacteroides_fragilis', 'k__Bacteria.p__Firmicutes.c__Erysipelotrichia.o__Erysipelotrichales.f__Erysipelotrichaceae.g__Holdemanella.s__Holdemanella_porci', 'k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Prevotellaceae.g__Prevotellamassilia.s__Prevotellamassilia_timonensis')
# this one doesn't include the Oscillospiraceae, so will just have to point towards the supplementary table for those.
heal_acc_carr_table <- all_patterns %>% filter(feature %in% to_get_list)
write_csv(heal_acc_carr_table, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.health_acc_carr_table.csv'))